library(tidyverse)
library(caret)
Data


Execution times are divided into quartiles in order to determine which ranking profiles are fast and which are slow. For each number of alternatives, the profiles are split into four equal parts: the first quarter is taken as fast and the other three are considered slow.
Storing counts in `nn`, as `n` already present in input
ℹ Use `name = "new_name"` to pick a new name.
Using alpha for a discrete variable is not advised.
# For the reggresion problem
# fitControl <- trainControl(
# method = "repeatedcv",
# number = 5,
# repeats = 2)
fitControl <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 2,
classProbs = TRUE,
summaryFunction = prSummary)
Predicting execution time for profiles of a fixed size
Joining, by = c("n", "m", "id")
# Fit control para
fitControl <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 2,
classProbs = TRUE,
summaryFunction = prSummary)
Training 25%-75%
# Da muy malos resultados porque está desbalanceado
# # Para n = 10
# totrain <- data_quartiles %>%
# filter(n==10) %>%
# mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4"))) %>%
# select(starts_with("mu"), quartile)
# set.seed(123)
# trainIndex <- createDataPartition(totrain$quartile, p = .8,
# list = FALSE,
# times = 1)
# dataTrain <- totrain[ trainIndex,]
# dataTest <- totrain[-trainIndex,]
# set.seed(123)
# mclas_rf_10 <- train(
# quartile ~., data = dataTrain,
# method = "rf",
# tuneLength = 3,
# trControl = fitControl,
# metric = "AUC"
# )
# mclas_rf_10
library(ROSE)
Para n = 10 con los datos normalizados
Y en test:
pred <- predict(rf_10_rose_norm, dataTest)
postResample(pred = pred, obs = dataTest$quartile)
confusionMatrix(data = pred, reference = dataTest$quartile, mode = "prec_recall")
Y sin normalizar:
fitControl <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 2,
classProbs = TRUE,
summaryFunction = prSummary,
sampling = "up")
Error in trainControl(method = "repeatedcv", number = 5, repeats = 2, :
no se pudo encontrar la función "trainControl"
rf_10_rose
Random Forest
2240 samples
16 predictor
2 classes: 'fast', 'slow'
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 2 times)
Summary of sample sizes: 1792, 1792, 1792, 1792, 1792, 1792, ...
Resampling results across tuning parameters:
mtry AUC Precision Recall F
2 0.7875077 0.7439379 0.7243243 0.7336813
9 0.7803522 0.7403700 0.7391892 0.7394959
16 0.7768513 0.7310205 0.7436937 0.7371480
AUC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
confusionMatrix(data = pred, reference = (dataTest %>% filter(quartile == "fast"))$quartile, mode = "prec_recall")
Confusion Matrix and Statistics
Reference
Prediction fast slow
fast 125 0
slow 15 0
Accuracy : 0.8929
95% CI : (0.8294, 0.9388)
No Information Rate : 1
P-Value [Acc > NIR] : 1.0000000
Kappa : 0
Mcnemar's Test P-Value : 0.0003006
Precision : 1.0000
Recall : 0.8929
F1 : 0.9434
Prevalence : 1.0000
Detection Rate : 0.8929
Detection Prevalence : 0.8929
Balanced Accuracy : NA
'Positive' Class : fast
rose_train <- ROSE(quartile ~ ., data = totrain)$data %>%
mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
Error in ROSE(quartile ~ ., data = totrain) :
no se pudo encontrar la función "ROSE"
Comparación de las variables más importantes
(vip_mreg_rf_8_norm + vip_mreg_rf_9_norm + vip_mreg_rf_10_norm) |
(vip_mreg_rf_8 + vip_mreg_rf_9 + vip_mreg_rf_10)
totrain <- data_quartiles %>%
mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4"))) %>%
select(starts_with("mu"), quartile) %>%
filter(n!=10)
totest <- data_quartiles %>%
mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4"))) %>%
select(starts_with("mu"), quartile) %>%
filter(n==10)
set.seed(123)
rose_train <- ROSE(quartile ~ ., data = totrain)$data %>%
mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
print(table(rose_train$quartile))
fast slow
4146 4254
set.seed(123)
trainIndex <- createDataPartition(rose_train$quartile, p = .8,
list = FALSE,
times = 1)
dataTrain <- rose_train[ trainIndex,]
dataTest <- rose_train[-trainIndex,]
Seeking for the outliers
data_outliers_normalized <- left_join(data_normalized %>%
mutate(id = as.double(as.character(id))), outliers)
Joining, by = c("n", "m", "id")
fitControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
sampling = "down")
data_outliers_normalized
totrain <- data_outliers_normalized %>%
filter(n==8) %>%
select(starts_with("mu"), outlier)
set.seed(123)
trainIndex <- createDataPartition(totrain$outlier, p = .8,
list = FALSE,
times = 1)
dataTrain <- totrain[ trainIndex,]
dataTest <- totrain[-trainIndex,]
set.seed(123)
rf_8_outlier <- train(
outlier ~., data = dataTrain, method = "rf",
tuneLength = 10,
trControl = fitControl,
metric = "ROC"
)
vip_rf_8_outlier <- vip(rf_8_outlier)
pred <- predict(rf_8_outlier, dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics
Reference
Prediction yes no
yes 27 109
no 9 415
Accuracy : 0.7893
95% CI : (0.7531, 0.8224)
No Information Rate : 0.9357
P-Value [Acc > NIR] : 1
Kappa : 0.2363
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.75000
Specificity : 0.79198
Pos Pred Value : 0.19853
Neg Pred Value : 0.97877
Prevalence : 0.06429
Detection Rate : 0.04821
Detection Prevalence : 0.24286
Balanced Accuracy : 0.77099
'Positive' Class : yes
sens_rf_8_outlier <- sensitivity(pred, dataTest$outlier)
pred <- predict(rf_8_outlier, dataTest, type= "prob")
auc_rf_8_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
totrain <- data_outliers_normalized %>%
filter(n==9) %>%
select(starts_with("mu"), outlier)
set.seed(123)
trainIndex <- createDataPartition(totrain$outlier, p = .8,
list = FALSE,
times = 1)
dataTrain <- totrain[ trainIndex,]
dataTest <- totrain[-trainIndex,]
set.seed(123)
rf_9_outlier <- train(
outlier ~., data = dataTrain, method = "rf",
tuneLength = 10,
trControl = fitControl,
metric = "ROC"
)
vip_rf_9_outlier <- vip(rf_9_outlier)
pred <- predict(rf_9_outlier, dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics
Reference
Prediction yes no
yes 35 99
no 9 416
Accuracy : 0.8068
95% CI : (0.7716, 0.8387)
No Information Rate : 0.9213
P-Value [Acc > NIR] : 1
Kappa : 0.3117
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.79545
Specificity : 0.80777
Pos Pred Value : 0.26119
Neg Pred Value : 0.97882
Prevalence : 0.07871
Detection Rate : 0.06261
Detection Prevalence : 0.23971
Balanced Accuracy : 0.80161
'Positive' Class : yes
sens_rf_9_outlier <- sensitivity(pred, dataTest$outlier)
pred <- predict(rf_9_outlier, dataTest, type= "prob")
auc_rf_9_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
totrain <- data_outliers_normalized %>%
filter(n==10) %>%
select(starts_with("mu"), outlier)
set.seed(123)
trainIndex <- createDataPartition(totrain$outlier, p = .8,
list = FALSE,
times = 1)
dataTrain <- totrain[ trainIndex,]
dataTest <- totrain[-trainIndex,]
set.seed(123)
rf_10_outlier <- train(
outlier ~., data = dataTrain, method = "rf",
tuneLength = 10,
trControl = fitControl,
metric = "ROC"
)
vip_rf_10_outlier <- vip(rf_10_outlier)
pred <- predict(rf_10_outlier, dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics
Reference
Prediction yes no
yes 34 139
no 7 379
Accuracy : 0.7388
95% CI : (0.7003, 0.7748)
No Information Rate : 0.9267
P-Value [Acc > NIR] : 1
Kappa : 0.226
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.82927
Specificity : 0.73166
Pos Pred Value : 0.19653
Neg Pred Value : 0.98187
Prevalence : 0.07335
Detection Rate : 0.06082
Detection Prevalence : 0.30948
Balanced Accuracy : 0.78046
'Positive' Class : yes
sens_rf_10_outlier <- sensitivity(pred, dataTest$outlier)
pred <- predict(rf_10_outlier, dataTest, type= "prob")
auc_rf_10_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
vip_rf_8_outlier + vip_rf_9_outlier + vip_rf_10_outlier

totrain <- data_outliers_normalized %>%
select(starts_with("mu"), outlier)
set.seed(123)
trainIndex <- createDataPartition(totrain$outlier, p = .8,
list = FALSE,
times = 1)
dataTrain <- totrain[ trainIndex,]
dataTest <- totrain[-trainIndex,]
set.seed(123)
rf_all_outlier <- train(
#outlier ~., data = dataTrain, method = "rf",
outlier ~., data = dataTrain, method = "rf",
tuneLength = 8,
trControl = fitControl,
metric = "ROC"
)
pred <- predict(rf_all_outlier, dataTest)
postResample(pred = pred, obs = dataTest$outlier)
Accuracy Kappa
0.7641453 0.2541812
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics
Reference
Prediction yes no
yes 102 377
no 19 1181
Accuracy : 0.7641
95% CI : (0.7431, 0.7843)
No Information Rate : 0.9279
P-Value [Acc > NIR] : 1
Kappa : 0.2542
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.84298
Specificity : 0.75802
Pos Pred Value : 0.21294
Neg Pred Value : 0.98417
Prevalence : 0.07207
Detection Rate : 0.06075
Detection Prevalence : 0.28529
Balanced Accuracy : 0.80050
'Positive' Class : yes
sens_rf_all_outlier <- sensitivity(pred, dataTest$outlier)
vip(rf_all_outlier)

pred <- predict(rf_all_outlier, dataTest, type= "prob")
auc_rf_all_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
vip(rf_all_outlier,
horizontal = FALSE,
aesthetics = list(width = .5)) +
theme_bw() +
scale_x_discrete(labels = function(x) parse(text=paste0("mu[", str_remove(x, "mu"), "]"))) +
ylab("Variable\nimportance") +
theme(text=element_text(size = 12, family="Times New Roman"),
axis.title.x = element_text(margin = margin(t = 10)))
sens_rf_8_outlier
[1] 0.75
auc_rf_8_outlier
[1] 0.8313454
sens_rf_9_outlier
[1] 0.7954545
auc_rf_9_outlier
[1] 0.8796778
sens_rf_10_outlier
[1] 0.8292683
auc_rf_10_outlier
[1] 0.8204869
sens_rf_all_outlier
[1] 0.8429752
auc_rf_all_outlier
[1] 0.863586
confusionMatrix(data = pred, reference = totest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics
Reference
Prediction yes no
yes 169 811
no 38 1782
Accuracy : 0.6968
95% CI : (0.6794, 0.7138)
No Information Rate : 0.9261
P-Value [Acc > NIR] : 1
Kappa : 0.1853
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.81643
Specificity : 0.68723
Pos Pred Value : 0.17245
Neg Pred Value : 0.97912
Prevalence : 0.07393
Detection Rate : 0.06036
Detection Prevalence : 0.35000
Balanced Accuracy : 0.75183
'Positive' Class : yes
Ahora para los cuartiles
totrain <- data_quartiles_normalized %>%
filter(n!=10) %>%
select(starts_with("mu"), quartile) %>%
mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4")))
totest <- data_quartiles_normalized %>%
filter(n==10) %>%
select(starts_with("mu"), quartile) %>%
mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4")))
set.seed(123)
trainIndex <- createDataPartition(totrain$quartile, p = .8,
list = FALSE,
times = 1)
dataTrain <- totrain[ trainIndex,]
dataTest <- totrain[-trainIndex,]
set.seed(123)
rf_quartiles <- train(
quartile ~., data = dataTrain, method = "rf",
tuneLength = 10,
trControl = fitControl,
metric = "ROC"
)
pred <- predict(rf_quartiles, dataTest)
postResample(pred = pred, obs = dataTest$quartile)
Accuracy Kappa
0.6839286 0.3257143
# confusionMatrix(data = pred, reference = dataTest$quartile, mode = "sens_spec")
pred <- predict(rf_quartiles, totest)
postResample(pred = pred, obs = totest$quartile)
Accuracy Kappa
0.6967857 0.3713439
confusionMatrix(data = pred, reference = totest$quartile, mode = "sens_spec")
Confusion Matrix and Statistics
Reference
Prediction fast slow
fast 576 725
slow 124 1375
Accuracy : 0.6968
95% CI : (0.6794, 0.7138)
No Information Rate : 0.75
P-Value [Acc > NIR] : 1
Kappa : 0.3713
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.8229
Specificity : 0.6548
Pos Pred Value : 0.4427
Neg Pred Value : 0.9173
Prevalence : 0.2500
Detection Rate : 0.2057
Detection Prevalence : 0.4646
Balanced Accuracy : 0.7388
'Positive' Class : fast
pred <- predict(rf_quartiles, totest, type= "prob")
AUC(pred$fast, ifelse(totest$quartile == "fast", 1, 0))
[1] 0.8010837
vip(rf_quartiles)



---
title: "Training"
output: html_notebook
---

```{r}
library(tidyverse)
library(caret)
```

# Data

```{r echo=FALSE}
# Jittered execution times for every value of m, coloured by quartile, with
# one panel per n. The solid line is drawn at mean(exec_time) and the dashed
# line at median(exec_time).
ggplot(
  times,
  aes(x = as.numeric(m), y = exec_time, color = quartile, group = m)
) +
  geom_jitter(height = 0) +
  geom_hline(aes(yintercept = mean(exec_time))) +
  geom_hline(aes(yintercept = median(exec_time)), linetype = "dashed") +
  facet_wrap(~n, scales = "free_x") +
  coord_flip() +
  labs(x = "", y = "") +
  theme_light()
```

```{r}
# Box plot of exec_time, one panel per n laid out in three rows, with the
# y axis decorations removed because the axis carries no information.
ggplot(times, aes(x = exec_time)) +
  geom_boxplot() +
  facet_wrap(n ~ ., scales = "free_x", nrow = 3, strip.position = "right") +
  scale_x_continuous(n.breaks = 10) +
  labs(x = "", y = "") +
  theme_bw() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank()
  )
```

Execution times are divided into quartiles in order to determine which ranking profiles are fast and which are slow. For each number of alternatives, the profiles are split into four equal parts: the first quarter is taken as fast and the other three are considered slow.

```{r echo=FALSE}
# Proportion of profiles per quartile for every (n, m) pair, one panel per n.
# `times` already has a column called `n`, so the count column is named
# explicitly ("nn") instead of relying on dplyr's automatic renaming (and the
# message it prints).
ggplot(
  times %>% count(n, m, quartile, name = "nn"),
  aes(x = m, y = nn, fill = quartile)
) +
  # Alternate the transparency of even and odd values of m so that
  # neighbouring bars can be told apart.
  geom_col(
    aes(alpha = as.numeric(as.character(m)) %% 2 == 0),
    position = "fill"
  ) +
  geom_text(
    aes(label = nn),
    position = position_fill(vjust = 0.5),
    angle = 90
  ) +
  facet_grid(. ~ n) +
  # Reference lines at the 25%, 50% and 75% proportions.
  geom_hline(yintercept = c(.25, .5, .75)) +
  scale_alpha_discrete(range = c(0.6, 1)) +
  theme_bw() +
  theme(legend.position = "none")
```
```{r}
# Resampling setup for the regression problem (kept for reference):
# fitControl <- trainControl(
#   method = "repeatedcv",
#   number = 5,
#   repeats = 2)

# Classification setup: 5-fold cross-validation repeated twice, with class
# probabilities enabled and prSummary as the summary function.
fitControl <- trainControl(
  method = "repeatedcv", number = 5, repeats = 2,
  classProbs = TRUE, summaryFunction = prSummary
)
```

# Predicting execution time for profiles of a fixed size

```{r}
# Attach the columns of `times` to the rows of `predict_times`.
# `id` is converted to double through character first (presumably because it
# is stored as a factor in `predict_times` - confirm).
data_quartiles <- predict_times %>%
  mutate(id = as.double(as.character(id))) %>%
  left_join(times)
data_quartiles
```

```{r echo=FALSE}
# Attach the columns of `times` to the rows of `data_normalized`.
# `id` is converted to double through character first (presumably because it
# is stored as a factor in `data_normalized` - confirm).
data_quartiles_normalized <- data_normalized %>%
  mutate(id = as.double(as.character(id))) %>%
  left_join(times)
data_quartiles_normalized
```

```{r}
# Resampling setup for the classifiers: 5-fold cross-validation repeated
# twice, with class probabilities enabled and prSummary as the summary
# function.
fitControl <- trainControl(
  method = "repeatedcv", number = 5, repeats = 2,
  classProbs = TRUE, summaryFunction = prSummary
)
```

Training 25%-75%

```{r}
# Disabled experiment: gives very poor results because the classes are imbalanced
# # For n = 10
# totrain <- data_quartiles %>% 
#   filter(n==10) %>% 
#   mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4"))) %>%
#   select(starts_with("mu"), quartile) 
# set.seed(123)
# trainIndex <- createDataPartition(totrain$quartile, p = .8, 
#                                   list = FALSE, 
#                                   times = 1)
# dataTrain <- totrain[ trainIndex,]
# dataTest  <- totrain[-trainIndex,]
# set.seed(123)
# mclas_rf_10 <- train(
#   quartile ~., data = dataTrain, 
#   method = "rf",
#   tuneLength = 3,
#   trControl = fitControl,
#   metric = "AUC"
# )
# mclas_rf_10
```

```{r}
library(ROSE)
```

Para n = 10 con los datos normalizados

```{r}
# Binary target for n = 10 on the normalized data: q1 becomes "fast" and
# q2-q4 become "slow"; only the mu* predictors and the target are kept.
totrain <- data_quartiles_normalized %>%
  filter(n == 10) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  ) %>%
  select(starts_with("mu"), quartile)

# Generate a ROSE sample of the data and move "slow" to the last factor level.
set.seed(123)
rose_train <- ROSE(quartile ~ ., data = totrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
# print(table(rose_train$quartile))

# 80/20 split of the ROSE sample.
# NOTE(review): the held-out rows come from the ROSE output rather than from
# the original data - confirm this is intended.
set.seed(123)
trainIndex <- createDataPartition(
  rose_train$quartile,
  p = .8, list = FALSE, times = 1
)
dataTrain <- rose_train[trainIndex, ]
dataTest <- rose_train[-trainIndex, ]

# Random forest tuned on AUC with the current fitControl.
set.seed(123)
rf_10_rose_norm <- train(
  quartile ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 3,
  metric = "AUC"
)

vip_rf_10_rose_norm <- vip(rf_10_rose_norm)
confusionMatrix(rf_10_rose_norm)
```


Y en test:

```{r}
# Evaluate rf_10_rose_norm on the held-out rows: accuracy/kappa first, then
# the confusion matrix with precision/recall statistics.
pred <- predict(rf_10_rose_norm, newdata = dataTest)
postResample(pred = pred, obs = dataTest$quartile)
confusionMatrix(
  data = pred,
  reference = dataTest$quartile,
  mode = "prec_recall"
)
```

Y sin normalizar:

```{r}
# Same experiment on the non-normalized features, this time letting caret
# up-sample inside resampling (sampling = "up").
fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  classProbs = TRUE,
  summaryFunction = prSummary,
  sampling = "up"
)

# Convert `id` through character, as the other joins in this notebook do:
# as.double() applied directly to a factor would return the level codes
# instead of the ids and the join keys would no longer match.
data_quartiles <- left_join(
  predict_times %>% mutate(id = as.double(as.character(id))),
  times
)

# Binary target for n = 10: q1 is "fast", q2-q4 are "slow".
totrain <- data_quartiles %>%
  filter(n == 10) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  ) %>%
  select(starts_with("mu"), quartile)

# Seed the split and the fit so the chunk is reproducible, like the other
# training chunks of this notebook.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$quartile,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

set.seed(123)
rf_10_rose <- train(
  quartile ~ .,
  data = dataTrain,
  method = "rf",
  tuneLength = 3,
  trControl = fitControl,
  metric = "AUC"
)
rf_10_rose
```




```{r}
# Convert `id` through character, as the other joins in this notebook do:
# as.double() applied directly to a factor would return the level codes
# instead of the ids and the join keys would no longer match.
data_quartiles <- left_join(
  predict_times %>% mutate(id = as.double(as.character(id))),
  times
)

# Binary target for n = 10: q1 is "fast", q2-q4 are "slow".
totrain <- data_quartiles %>%
  filter(n == 10) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  ) %>%
  select(starts_with("mu"), quartile)

# Seed the split so the train/test partition is reproducible.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$quartile,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# ROSE is applied to the training rows only, so dataTest keeps original data.
set.seed(123)
rose_train <- ROSE(quartile ~ ., data = dataTrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))

# NOTE(review): if fitControl still carries sampling = "up" from an earlier
# chunk, the ROSE sample is resampled again inside cross-validation - confirm
# which fitControl is meant to be active here.
set.seed(123)
rf_10_rose <- train(
  quartile ~ .,
  data = rose_train,
  method = "rf",
  tuneLength = 3,
  trControl = fitControl,
  metric = "AUC"
)

vip_rf_10_rose <- vip(rf_10_rose)
confusionMatrix(rf_10_rose)
```

```{r}
# Evaluate rf_10_rose on the held-out rows: accuracy/kappa first, then the
# confusion matrix with precision/recall statistics.
pred <- predict(rf_10_rose, newdata = dataTest)
postResample(pred = pred, obs = dataTest$quartile)
confusionMatrix(
  data = pred,
  reference = dataTest$quartile,
  mode = "prec_recall"
)
```






```{r}
# Convert `id` through character, as the other joins in this notebook do:
# as.double() applied directly to a factor would return the level codes
# instead of the ids and the join keys would no longer match.
data_quartiles <- left_join(
  predict_times %>% mutate(id = as.double(as.character(id))),
  times
)

# Binary target for n = 10: q1 is "fast", q2-q4 are "slow".
totrain <- data_quartiles %>%
  filter(n == 10) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  ) %>%
  select(starts_with("mu"), quartile)

# Generate a ROSE sample of the whole n = 10 data and move "slow" to the last
# factor level.
set.seed(123)
rose_train <- ROSE(quartile ~ ., data = totrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
# print(table(rose_train$quartile))

# 80/20 split of the ROSE sample.
# NOTE(review): the held-out rows come from the ROSE output rather than from
# the original data - confirm this is intended.
set.seed(123)
trainIndex <- createDataPartition(
  rose_train$quartile,
  p = .8, list = FALSE, times = 1
)
dataTrain <- rose_train[trainIndex, ]
dataTest <- rose_train[-trainIndex, ]

set.seed(123)
rf_10_rose <- train(
  quartile ~ .,
  data = dataTrain,
  method = "rf",
  tuneLength = 3,
  trControl = fitControl,
  metric = "AUC"
)

vip_rf_10_rose <- vip(rf_10_rose)
confusionMatrix(rf_10_rose)
```

```{r}
# Neural network (method = "nnet") on the current dataTrain, with centring and
# scaling applied by caret. Despite the rpart_ prefix of the object name, no
# rpart model is fitted here.
set.seed(123)
rpart_10_rose_norm <- train(
  quartile ~ .,
  data = dataTrain,
  method = "nnet",
  preProcess = c("center", "scale"),
  trControl = fitControl,
  tuneLength = 3,
  metric = "AUC"
)
```




Comparación de las variables más importantes

```{r}
# Combine the variable-importance plots of the normalized models (first group)
# and of the non-normalized models (second group) into one figure.
# NOTE(review): the vip_mreg_rf_* objects are not created anywhere in this
# notebook, and combining plots with `+` and `|` presumably relies on the
# patchwork package being attached - confirm both before running.
(vip_mreg_rf_8_norm + vip_mreg_rf_9_norm + vip_mreg_rf_10_norm) |
(vip_mreg_rf_8 + vip_mreg_rf_9 + vip_mreg_rf_10)
```










```{r}
# Train on every n except 10 and keep n = 10 as an external test set.
# The filter on n has to run BEFORE select(): select() keeps only the mu*
# columns and quartile, so afterwards the column n is gone and the filter
# would no longer act on the data.
totrain <- data_quartiles %>%
  filter(n != 10) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  ) %>%
  select(starts_with("mu"), quartile)
totest <- data_quartiles %>%
  filter(n == 10) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  ) %>%
  select(starts_with("mu"), quartile)

# Generate a ROSE sample of the training data and move "slow" to the last
# factor level; the table shows the resulting class counts.
set.seed(123)
rose_train <- ROSE(quartile ~ ., data = totrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
print(table(rose_train$quartile))

# 80/20 split of the ROSE sample.
set.seed(123)
trainIndex <- createDataPartition(
  rose_train$quartile,
  p = .8, list = FALSE, times = 1
)
dataTrain <- rose_train[trainIndex, ]
dataTest <- rose_train[-trainIndex, ]
```


# Searching for outliers

```{r}
# Attach the outlier labels to the normalized features. The join keys are
# spelled out (they are the columns dplyr would pick automatically) so the
# join is explicit and no "Joining, by = ..." message is printed.
data_outliers_normalized <- data_normalized %>%
  mutate(id = as.double(as.character(id))) %>%
  left_join(outliers, by = c("n", "m", "id"))

# Resampling setup for the outlier models: 10-fold cross-validation repeated
# five times, twoClassSummary as the summary function and down-sampling
# inside resampling.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  sampling = "down"
)

data_outliers_normalized
```

```{r}
# Outlier classifier for n = 8 on the normalized features.
totrain <- data_outliers_normalized %>%
  filter(n == 8) %>%
  select(starts_with("mu"), outlier)

# 80/20 split.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest tuned on ROC with the current fitControl.
set.seed(123)
rf_8_outlier <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

vip_rf_8_outlier <- vip(rf_8_outlier)

# Class predictions on the held-out rows: confusion matrix and sensitivity.
pred <- predict(rf_8_outlier, newdata = dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
sens_rf_8_outlier <- sensitivity(pred, dataTest$outlier)

# Class probabilities on the same rows: AUC for the "yes" class.
pred <- predict(rf_8_outlier, newdata = dataTest, type = "prob")
auc_rf_8_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
```

```{r}
# Outlier classifier for n = 9 on the normalized features.
totrain <- data_outliers_normalized %>%
  filter(n == 9) %>%
  select(starts_with("mu"), outlier)

# 80/20 split.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest tuned on ROC with the current fitControl.
set.seed(123)
rf_9_outlier <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

vip_rf_9_outlier <- vip(rf_9_outlier)

# Class predictions on the held-out rows: confusion matrix and sensitivity.
pred <- predict(rf_9_outlier, newdata = dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
sens_rf_9_outlier <- sensitivity(pred, dataTest$outlier)

# Class probabilities on the same rows: AUC for the "yes" class.
pred <- predict(rf_9_outlier, newdata = dataTest, type = "prob")
auc_rf_9_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
```

```{r}
# Outlier classifier for n = 10 on the normalized features.
totrain <- data_outliers_normalized %>%
  filter(n == 10) %>%
  select(starts_with("mu"), outlier)

# 80/20 split.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest tuned on ROC with the current fitControl.
set.seed(123)
rf_10_outlier <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

vip_rf_10_outlier <- vip(rf_10_outlier)

# Class predictions on the held-out rows: confusion matrix and sensitivity.
pred <- predict(rf_10_outlier, newdata = dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
sens_rf_10_outlier <- sensitivity(pred, dataTest$outlier)

# Class probabilities on the same rows: AUC for the "yes" class.
pred <- predict(rf_10_outlier, newdata = dataTest, type = "prob")
auc_rf_10_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
```

```{r}
# Show the variable-importance plots of the n = 8, 9 and 10 outlier models
# together (combining plots with `+` presumably relies on the patchwork
# package being attached - confirm).
vip_rf_8_outlier + vip_rf_9_outlier + vip_rf_10_outlier
```


```{r}
# Outlier classifier trained on every n at once (normalized features).
totrain <- data_outliers_normalized %>%
  select(starts_with("mu"), outlier)

# 80/20 split.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest tuned on ROC with the current fitControl.
set.seed(123)
rf_all_outlier <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 8,
  metric = "ROC"
)

# Class predictions on the held-out rows: accuracy/kappa, confusion matrix
# and sensitivity.
pred <- predict(rf_all_outlier, newdata = dataTest)
postResample(pred = pred, obs = dataTest$outlier)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
sens_rf_all_outlier <- sensitivity(pred, dataTest$outlier)

vip(rf_all_outlier)

# Class probabilities on the same rows: AUC for the "yes" class.
pred <- predict(rf_all_outlier, newdata = dataTest, type = "prob")
auc_rf_all_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
```

```{r}
# Variable-importance plot of rf_all_outlier. The predictor names are turned
# into plotmath expressions of the form mu[<suffix>] for the axis labels.
vip(
  rf_all_outlier,
  horizontal = FALSE,
  aesthetics = list(width = .5)
) +
  theme_bw() +
  scale_x_discrete(
    labels = function(x) parse(text = paste0("mu[", str_remove(x, "mu"), "]"))
  ) +
  ylab("Variable\nimportance") +
  theme(
    text = element_text(size = 12, family = "Times New Roman"),
    axis.title.x = element_text(margin = margin(t = 10))
  )
  
```


```{r}
# Print the held-out sensitivity and AUC stored for each outlier model
# (n = 8, n = 9, n = 10 and all n together).
sens_rf_8_outlier
auc_rf_8_outlier
sens_rf_9_outlier
auc_rf_9_outlier
sens_rf_10_outlier
auc_rf_10_outlier
sens_rf_all_outlier
auc_rf_all_outlier
```


```{r}
# Alternative resampling setup (disabled):
# fitControl <- trainControl(
#   method = "repeatedcv",
#   number = 3,
#   repeats = 5,
#   classProbs = TRUE,
#   summaryFunction = twoClassSummary,
#   sampling = "down")

# Train on every n except 10; n = 10 is kept apart as an external test set.
totrain <- data_outliers_normalized %>%
  filter(n != 10) %>%
  select(starts_with("mu"), outlier)

totest <- data_outliers_normalized %>%
  filter(n == 10) %>%
  select(starts_with("mu"), outlier)

# 80/20 split of the n != 10 rows.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest tuned on ROC (author's note: 85 with rpart2).
set.seed(123)
rf_all_outlier2 <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 8,
  metric = "ROC"
)

# Held-out rows from the n != 10 data.
pred <- predict(rf_all_outlier2, newdata = dataTest)
postResample(pred = pred, obs = dataTest$outlier)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")

# External n = 10 test set.
pred <- predict(rf_all_outlier2, newdata = totest)
postResample(pred = pred, obs = totest$outlier)
confusionMatrix(data = pred, reference = totest$outlier, mode = "sens_spec")

# AUC for the "yes" class on the n = 10 test set.
pred <- predict(rf_all_outlier2, newdata = totest, type = "prob")
AUC(pred$yes, ifelse(totest$outlier == "yes", 1, 0))

vip(rf_all_outlier2)
```


# Ahora para los cuartiles


```{r}
# Fast/slow classifier: train on every n except 10 and test on n = 10.
# q1 becomes "fast" and q2-q4 become "slow".
totrain <- data_quartiles_normalized %>%
  filter(n != 10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  )

totest <- data_quartiles_normalized %>%
  filter(n == 10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  )

# 80/20 split of the n != 10 rows.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$quartile,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest tuned on ROC with the current fitControl.
set.seed(123)
rf_quartiles <- train(
  quartile ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

# Held-out rows from the n != 10 data.
pred <- predict(rf_quartiles, newdata = dataTest)
postResample(pred = pred, obs = dataTest$quartile)
# confusionMatrix(data = pred, reference = dataTest$quartile, mode = "sens_spec")

# External n = 10 test set.
pred <- predict(rf_quartiles, newdata = totest)
postResample(pred = pred, obs = totest$quartile)
confusionMatrix(data = pred, reference = totest$quartile, mode = "sens_spec")

# AUC for the "fast" class on the n = 10 test set.
pred <- predict(rf_quartiles, newdata = totest, type = "prob")
AUC(pred$fast, ifelse(totest$quartile == "fast", 1, 0))

vip(rf_quartiles)
```

```{r}
# Fast/slow classifier: train on every n except 10 and test on n = 10.
# q1 becomes "fast" and q2-q4 become "slow".
totrain <- data_quartiles_normalized %>%
  filter(n != 10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  )

totest <- data_quartiles_normalized %>%
  filter(n == 10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  )

# 80/20 split of the n != 10 rows.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$quartile,
  p = .8, list = FALSE, times = 1
)

dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Despite the rf_ prefix of the object name, this model is fitted with
# method = "rpart2", tuned on ROC with the current fitControl.
set.seed(123)
rf_quartiles2 <- train(
  quartile ~ .,
  data = dataTrain,
  method = "rpart2",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

# Held-out rows from the n != 10 data.
pred <- predict(rf_quartiles2, newdata = dataTest)
postResample(pred = pred, obs = dataTest$quartile)
# confusionMatrix(data = pred, reference = dataTest$quartile, mode = "sens_spec")

# External n = 10 test set.
pred <- predict(rf_quartiles2, newdata = totest)
postResample(pred = pred, obs = totest$quartile)
confusionMatrix(data = pred, reference = totest$quartile, mode = "sens_spec")

# AUC for the "fast" class on the n = 10 test set.
pred <- predict(rf_quartiles2, newdata = totest, type = "prob")
AUC(pred$fast, ifelse(totest$quartile == "fast", 1, 0))

vip(rf_quartiles2)
```

```{r}
# Variable-importance plot of rf_quartiles2 limited to five features. The
# predictor names are turned into plotmath expressions of the form
# mu[<suffix>] for the axis labels.
vip(
  rf_quartiles2,
  num_features = 5,
  aesthetics = list(width = .5)
) +
  theme_bw() +
  scale_x_discrete(
    labels = function(x) parse(text = paste0("mu[", str_remove(x, "mu"), "]"))
  ) +
  ylab("Variable importance") +
  theme(
    text = element_text(size = 12, family = "Times New Roman"),
    axis.title.x = element_text(margin = margin(t = 10))
  )
```

